MacAddict 108

home *** CD-ROM | disk | FTP | other *** search

/ MacAddict 108 / MacAddict108.iso / Software / Internet & Communication / JunkMatcher 1.5.5.dmg / JunkMatcher.app / Contents / Resources / Engine / SiteDB.py < prev next >

Wrap

Python Source | 2005-06-01 | 20.6 KB | 619 lines

# # SiteDB.py # JunkMatcher # # Created by Benjamin Han on 2/1/05. # Copyright (c) 2005 Benjamin Han. All rights reserved. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. #!/usr/bin/env python # IMPORTANT: SiteDB can have only a single instance! import datetime, copy, string from consts import * from utilities import * from rwlock import * # to ensure thread-safety, since we will have only one global SiteDB _gTLD=sets.Set(openFile('%setc/gTLD' % ENGINE_PATH).read().split('\n')[:-1]) _gTLD.add('arpa') # to be 100% precise arpa is not a generic TLD _ccTLD=sets.Set(openFile('%setc/ccTLD' % ENGINE_PATH).read().split('\n')[:-1]) # SiteDB is implemented to accomodate multiple readers and (xor) single writer _siteDBRWLock = RWLock() class SiteDB (dict): """Bad sites collection: a tree ---------------------------- fn: the filename count: the # of unique sites (or leaf nodes) sizeLimit: the max capacity (count will be kept less than sizeLimit; 0 with no limit) Note: each node is a list [time, count, level, children] - time: # of seconds since epoch in UTC when the last time the node is updated - count: # of occurrences - level: depth in the tree (0 is the top-level, i.e., TLD) - children: a dictionary or None (if it's a leaf) """ def __init__ (self, fn, sizeLimit = 0): self.fn = fn self.sizeLimit = sizeLimit self.load() def __str__ (self): """Returns a string representation of the SiteDB; can be used with loadFromString() later.""" _siteDBRWLock.acquire_read() ret = '\n'.join(['%s\n%u %d' % ('.'.join(siteInfo[0]), siteInfo[1], siteInfo[2]) for siteInfo in self._iter()]) _siteDBRWLock.release() return ret def _load (self, s): """(INTERNAL) Load from a string 's' - NOT THREAD-SAFE!""" self.clear() self._count = 0 isName = True for l in filter(lambda l:len(l), map(string.strip, s.split('\n'))): if isName: site = l.split('.') isName = False else: l = l.split(' ') self._addOne(site, int(l[0]), int(l[1]), False) isName = True if self.sizeLimit and self._count >= self.sizeLimit: self._prune() def load (self): # NOTE: MULTIPLE THREADS MAY LOAD THE DB MULTIPLE TIMES! _siteDBRWLock.acquire_write() try: f = openFile(self.fn) except: return s = f.read() self._load(s) _siteDBRWLock.release() def loadFromString (self, s): # NOTE: MULTIPLE THREADS MAY LOAD THE DB MULTIPLE TIMES! _siteDBRWLock.acquire_write() try: self._load(s) except Exception, e: _siteDBRWLock.release() raise e _siteDBRWLock.release() def size (self): _siteDBRWLock.acquire_read() ret = self._count _siteDBRWLock.release() return ret def setSizeLimit (self, sizeLimit): """Set size limit of the SiteDB; returns True iff pruning occurs (and something did get removed during the pruning).""" _siteDBRWLock.acquire_write() ret = False try: if self.sizeLimit > sizeLimit: self.sizeLimit = sizeLimit oldSize = self._count self._prune() ret = (self._count != oldSize) else: self.sizeLimit = sizeLimit except Exception, e: printException(u'Exception in SiteDB.setSizeLimit()', e) _siteDBRWLock.release() return ret def _addOne (self, site, timeSec, numTimes, prune): """Add one site (a sequence of name components) to the tree - the site can be numerical or non-numerical, and only significant parts are kept: for non-numerical names with _gTLD, it's the last 2 components; for non-numerical names with _ccTLD, it's the last 3 components; for numerical names it's the 4 components. Pruning happens when necessary, if prune is set to True. ASSUMPTION: site has at least two components. """ tld = site[-1] level = 0 theDict = self isGTLD = tld in _gTLD isCCTLD = tld in _ccTLD if not isGTLD and not isCCTLD: try: int(tld) except: # bogus TLD return # numerical names - add from the first component forward for comp in site: if theDict is None: theDict = {} node[-1] = theDict if level < 3: node = theDict.setdefault(comp, [0, 0, level, {}]) theDict = node[-1] level += 1 else: node = theDict.setdefault(comp, [0, 0, level, None]) break if level != 3: # bogus IP address (must have 4 components) return else: if isGTLD: count = 1 else: count = 2 # non-numerical names - add from the last component backward for comp in site[::-1]: if theDict is None: theDict = {} node[-1] = theDict if level == count: node = theDict.setdefault(comp, [0, 0, level, None]) break else: node = theDict.setdefault(comp, [0, 0, level, {}]) theDict = node[-1] level += 1 node[1] += numTimes nodeTime = node[0] if nodeTime == 0: self._count += 1 # this is the first time the site is added node[0] = timeSec if prune and self.sizeLimit and self._count > self.sizeLimit: self._prune() elif nodeTime < timeSec: node[0] = timeSec # update to the latest time def addOne (self, site, timeSec, numTimes = 1, prune = True): """A thread-safe version of _addOne().""" _siteDBRWLock.acquire_write() ret = self._addOne(site, timeSec, numTimes, prune) _siteDBRWLock.release() return ret def getOne (self, site): """Try to add a site (a sequence of name components) into the collection; returns a tuple (l, n, b), where l is a list of not-yet-added but significant name components, n is the deepest node we can find, and b is a boolean: 1. l is None if the last component is bogus (not one of the valid TLDs) 2. n is None if the last component is not in the collection (in that case l is the full list of name components) 3. b is True iff site is *not* numerical Examples: 1. sites.getOne(('yahoo','com')) -> ((), n, True), where n is not None: 'yahoo.com' is in the collection; 2. sites.getOne(('www','yahoo','com')) -> ((), n, True), where n is not None: same as above; 3. sites.getOne(('cnet','com')) -> (('cnet'), n, True), where n is not None: there is already at least one .com site added, but cnet.com is not added before; 4. sites.getOne(('cnet','com')) -> (('cnet','com'), None, True): no .com site is added before; 5. sites.getOne(('rubbish','bogus')) -> (None, None, True); the last component is bogus. 6. sites.getOne(('128','2','3','4')) -> (('2','3','4'), None, False); at least one IP starting with 128 has been added before. """ tld = site[-1] theDict = self prevNode = None if tld in _gTLD: count = 2 elif tld in _ccTLD: count = 3 else: try: int(tld) count = 0 except: return None, None, True _siteDBRWLock.acquire_read() if count > 0: # non-numerical name for comp in site[::-1]: node = theDict.get(comp) if node is None: break else: count -= 1 prevNode = node theDict = node[-1] if theDict is None: break if prevNode is None: _siteDBRWLock.release() return site[-count:], None, True else: endPos = -(prevNode[2] + 1) _siteDBRWLock.release() return site[endPos-count:endPos], prevNode, True else: # numerical name for comp in site: try: int(comp) except: _siteDBRWLock.release() return None, None, True node = theDict.get(comp) if node is None: break else: count += 1 prevNode = node theDict = node[-1] if theDict is None: break _siteDBRWLock.release() return site[count:4], prevNode, False def addOneToNode (self, nameComps, node, notNumerical, timeSec, numTimes = 1): """Add one site with prefix/suffix in nameComps (a sequence; prefix if notNumerical is True, suffix otherwise) to node, so that in effect the complete name of the site is '.'.join(nameComps)+suffix (non-numeric), or prefix+'.'.join(nameComps) (numeric), where suffix/prefix is the name components collected from the root node to 'node'. Pruning happens when necessary. ASSUMPTION: nameComps contains only the significant components (based on what the TLD is) - this function is intended to be used with getOne(). TO-DO: if any exception occurs _siteDBRWLock might still be locked in write mode! """ if len(nameComps): level = node[2] + 1 theDict = node[-1] if notNumerical: nameComps.reverse() _siteDBRWLock.acquire_write() for comp in nameComps[:-1]: if theDict is None: theDict = {} node[-1] = theDict node = theDict.setdefault(comp, [0, 0, level, {}]) theDict = node[-1] level += 1 if theDict is None: theDict = {} node[-1] = theDict node = theDict.setdefault(nameComps[-1], [0, 0, level, None]) else: _siteDBRWLock.acquire_write() node[1] += numTimes nodeTime = node[0] if nodeTime == 0: self._count += 1 node[0] = timeSec if self.sizeLimit and self._count > self.sizeLimit: self._prune() elif nodeTime < timeSec: node[0] = timeSec _siteDBRWLock.release() def _removeOne (self, site, countMatters = False): """Removes a site (a sequence of name components) from the collection: if countMatters is True, decrements the count by 1 (and removes the entire entry if count becomes 0); if not, removes the entire entry; in both cases the timestamp will not be updated. Returns True iff change did occur. ASSUMPTION: site is syntactically correct (no int(.) test on each component of an IP address). WARNING: NOT thread-safe - use removeOne() for that! """ tld = site[-1] theDict = self path = [] if tld in _gTLD: count = 2 elif tld in _ccTLD: count = 3 else: try: int(tld) count = 0 except: return False # find a path from the leaf to the root if count > 0: for comp in site[::-1]: node = theDict.get(comp) if node is None: return False else: path.append([comp, node]) count -= 1 if count == 0: break theDict = node[-1] if theDict is None: return False else: for comp in site: node = theDict.get(comp) if node is None: return False else: path.append([comp, node]) count += 1 if count == 4: break theDict = node[-1] if theDict is None: return False # remove the path, from bottom up if countMatters: key, node = path[-1] # decrement the count if node[1]: node[1] -= 1 else: # not a leaf node - don't do anything return False if node[1] == 0: # count is decremented to zero for this node, remove it self._count -= 1 for nextKey, node in path[-2::-1]: theDict = node[-1] del theDict[key] # if this node has other children, or itself represents a site # don't try to remove it from its parent's dictionary if len(theDict) or node[1]: break key = nextKey else: theDict = self del theDict[key] else: self._count -= 1 key = path[-1][0] for nextKey, node in path[-2::-1]: theDict = node[-1] del theDict[key] # if this node has other children, or itself represents a site # don't try to remove it from its parent's dictionary if len(theDict) or node[1]: break key = nextKey else: theDict = self del theDict[key] return True def removeOne (self, site, countMatters = False): """A thread-safe version of _removeOne().""" _siteDBRWLock.acquire_write() try: ret = self._removeOne(site, countMatters) except Exception, e: printException(u'Exception in SiteDB.removeOne().', e) _siteDBRWLock.release() return ret def _prune (self): """Recency-based pruning: throw away the oldest n records; NOT thread-safe (use prune() for that).""" # n is the # of entries we want to prune n = self._count - int(self.sizeLimit * SAFE_SITE_SIZE_RATIO) if n <= 0: return l = [[[siteInfo[1], siteInfo[2]], copy.copy(siteInfo[0])] for siteInfo in self._iter()] l.sort() # sorted by time, then # of occurrence, then names for i in l[:n]: self._removeOne(i[1]) def prune (self): """A thread-safe version of _prune().""" _siteDBRWLock.acquire_write() try: self._prune() except Exception, e: printException(u'Exception in SiteDB.prune().', e) _siteDBRWLock.release() def recycle (self): _siteDBRWLock.acquire_write() try: self.clear() self._count = 0 os.remove(self.fn) except Exception, e: printException(u'Exception in SiteDB.recycle()', e) _siteDBRWLock.release() def _iter (self): """Iterates over the tree. Returns a list [site name, time, # of occurrences], where site name is a list of name components, time is # of seconds since epoch when the last time the site is updated. WARNING: NOT THREAD-SAFE! """ stack = [[k,v] for k, v in self.items()] nameComps = [] while len(stack): node = stack[0] del stack[0] payload = node[-1] level = payload[2] if level == 0: del nameComps[:] elif level < len(nameComps): del nameComps[0:-level] nameComps.insert(0, node[0]) if payload[1]: # it's a site try: int(nameComps[-1]) yield [nameComps[::-1], payload[0], payload[1]] except: # non-numerical name yield [nameComps,payload[0], payload[1]] if payload[-1] is not None: # not a leaf node - push the children for k,v in payload[-1].items()[::-1]: stack.insert(0,[k,v]) def removeSitesUsingPattern (self, sitesPattern): """Removes any site that matches sitesPattern (a regex pattern); returns True iff there's any change.""" removeList = [] _siteDBRWLock.acquire_read() for siteInfo in self._iter(): siteName = '.'.join(siteInfo[0]) if sitesPattern.search(siteName): removeList.append(siteName) _siteDBRWLock.release() # it's possible some sites are already gone by now (since it's not locked here) # but it's okay - removeOne() is tolerant enough if len(removeList): _siteDBRWLock.acquire_write() try: noChange = True for site in removeList: if self._removeOne(site.split('.')) and noChange: noChange = False except Exception, e: printException(u'Exception in SiteDB.removeSitesUsingPattern()', e) _siteDBRWLock.release() return not noChange return False def writeToFile (self, fn = None): if fn: f = openFile(fn, 'w') else: f = openFile(self.fn, 'w') f.write(self.__str__()) def loadIntoList (self, siteList): """Load into siteList - a list of dict with keys 'name', 'time', and 'count'.""" _siteDBRWLock.acquire_read() tmpList = map(lambda siteInfo: (siteInfo[1], {'name':'.'.join(siteInfo[0]), 'time':datetime.datetime.utcfromtimestamp(siteInfo[1]), 'count':siteInfo[2]}), self._iter()) tmpList.sort() tmpList.reverse() siteList[:] = map(lambda i:i[1], tmpList) _siteDBRWLock.release() if __name__ == '__main__': siteDB = SiteDB('%ssiteDB' % CONF_PATH, 2000) l, n, b = siteDB.getOne(('xyz', 'com')) print l, n is not None, b l, n, b = siteDB.getOne(('xyz', 'abc', 'biz')) print l, n is not None, b l, n, b = siteDB.getOne(('xyz', 'abc', 'com', 'tw')) print l, n is not None, b l, n, b = siteDB.getOne(('xyz', 'abc', 'com', 'il')) print l, n is not None, b l, n, b = siteDB.getOne(('com', )) # so don't pass this to SiteDB! print l, n is not None, b l, n, b = siteDB.getOne(('rubbish', 'bogus')) print l, n is not None, b l, n, b = siteDB.getOne(('66', '0', '78', '20')) print l, n is not None, b print print '* before removal:', siteDB.size() print '* any "biz" site removed?', siteDB.removeSitesUsingPattern(re.compile(r'\.biz$')) print '* any "edu" site removed?', siteDB.removeSitesUsingPattern(re.compile(r'\.edu$')) print '* after removal:', siteDB.size() print siteDB.removeOne('24info.com.tw'.split('.')) s = str(siteDB) siteDB.loadFromString(s) # test writing the content to a file #siteDB.writeToFile('sites.out') #print '* Wrote %d sites to "sites.out".' % siteDB.size() #siteList = [] #siteDB.loadIntoList(siteList) #print '\n'.join(['%s (%d): %s' % (s['name'], s['count'], s['time']) # for s in siteList])